In [2]:
import pandas as pd
import requests, zipfile, io, re, nltk
from datetime import datetime, timedelta
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from keras import models, layers
from keras.preprocessing.text import Tokenizer 
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Simple Kfold
def kfold(network, neg_tweets, pos_tweets, batch_size, epochs=25):
    kf = KFold(n_splits=5)
    n = 0
    fold_times = []
    histories = []
    if len(neg_tweets) > len(pos_tweets):
        tweets = pos_tweets 
    else: tweets = neg_tweets
    for train_idx, test_idx in kf.split(tweets):
        start = datetime.now()
        print(f"Fold {n}")
        n+=1
        print(f"Test Index Start:{test_idx[0]}")
        print(f"Test Set Size:{len(test_idx)}")
        train_tweets = np.append(neg_tweets[train_idx], pos_tweets[train_idx], axis=0)
        train_labels = np.append(np.full(len(train_idx), 0), np.full(len(train_idx), 1))

        test_tweets = np.append(neg_tweets[test_idx], pos_tweets[test_idx], axis=0)
        test_labels = np.append(np.full(len(test_idx), 0), np.full(len(test_idx), 1))
        validation_data = (test_tweets, test_labels)
        
        history = network.fit(train_tweets, train_labels, batch_size=batch_size, validation_data=validation_data, 
                              epochs=epochs, verbose=0, use_multiprocessing=True)
        histories.append(history)
        summarize_diagnostics(history=history)
        fold_time = datetime.now()-start
        fold_times.append(fold_time.total_seconds())
        print("fold time: ", fold_time)
    summarize_diagnostics(histories=histories)
    print(timedelta(seconds=np.average(fold_times)))

# Plots results to graph
def summarize_diagnostics(history=None, histories=None):
    # plot loss
    plt.subplot(211)
    plt.tight_layout()
    plt.title('Cross Entropy Loss')
    if history:
        plt.plot(history.history['loss'], color='blue', label='train')
        plt.plot(history.history['val_loss'], color='orange', label='test')
    else:
        plt.plot(np.mean([history.history['loss'] for history in histories], axis=0), color='blue', label='train')
        plt.plot(np.mean([history.history['val_loss'] for history in histories], axis=0), color='orange', label='test')
    # plot accuracy
    plt.subplot(212)
    plt.title('Classification Accuracy')
    if history:
        plt.plot(history.history['accuracy'], color='blue', label='train')
        plt.plot(history.history['val_accuracy'], color='orange', label='test')
    else:
        plt.plot(np.mean([history.history['accuracy'] for history in histories], axis=0), color='blue', label='train')
        plt.plot(np.mean([history.history['val_accuracy'] for history in histories], axis=0), color='orange', label='test')
    plt.legend()
    plt.show()
[nltk_data] Downloading package stopwords to /home/techn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/techn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/techn/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/techn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/techn/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!

Loading Raw Data¶

In [2]:
raw_df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None,
                         names=['polarity', 'id', 'date', 'query', 'user', 'tweet'])
raw_df
Out[2]:
polarity id date query user tweet
0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t...
1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....
... ... ... ... ... ... ...
1599995 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY AmandaMarie1028 Just woke up. Having no school is the best fee...
1599996 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY TheWDBoards TheWDB.com - Very cool to hear old Walt interv...
1599997 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY bpbabe Are you ready for your MoJo Makeover? Ask me f...
1599998 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY tinydiamondz Happy 38th Birthday to my boo of alll time!!! ...
1599999 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY RyanTrevMorris happy #charitytuesday @theNSPCC @SparksCharity...

1600000 rows × 6 columns

Data Preprocessing¶

Removing Unnecessary Data¶

In [3]:
df = raw_df.drop(columns=['id', 'query', 'polarity', 'user', 'date'])
# df['datetime'] = raw_df['date'].apply(lambda x: pd.to_datetime(x.replace('PDT ', '')))
df
Out[3]:
tweet
0 @switchfoot http://twitpic.com/2y1zl - Awww, t...
1 is upset that he can't update his Facebook by ...
2 @Kenichan I dived many times for the ball. Man...
3 my whole body feels itchy and like its on fire
4 @nationwideclass no, it's not behaving at all....
... ...
1599995 Just woke up. Having no school is the best fee...
1599996 TheWDB.com - Very cool to hear old Walt interv...
1599997 Are you ready for your MoJo Makeover? Ask me f...
1599998 Happy 38th Birthday to my boo of alll time!!! ...
1599999 happy #charitytuesday @theNSPCC @SparksCharity...

1600000 rows × 1 columns

Define and Scale y from {0, 4} to {0, 1}¶

In [4]:
y = raw_df['polarity']
print(f"Unique Elements of raw y: {pd.unique(y)}")
y = y.apply(lambda x: 1 if x==4 else 0)
y
Unique Elements of raw y: [0 4]
Out[4]:
0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: polarity, Length: 1600000, dtype: int64

Remove URLs and @ Handles from Tweets¶

In [5]:
processed_df = df.copy(deep=True)

# Define tokenizer to split tweet into word tokens
tokenizer = RegexpTokenizer(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)|\w+|[^\w\s]+")


# Remove URLs and User Mentions. All Twitter handles must be within 4 to 15 characters
processed_df['tweet'] = processed_df['tweet'].apply(lambda x: re.sub(r"http\S+|@\w{4,15}|#", "", x))
processed_df
Out[5]:
tweet
0 - Awww, that's a bummer. You shoulda got Da...
1 is upset that he can't update his Facebook by ...
2 I dived many times for the ball. Managed to s...
3 my whole body feels itchy and like its on fire
4 no, it's not behaving at all. i'm mad. why am...
... ...
1599995 Just woke up. Having no school is the best fee...
1599996 TheWDB.com - Very cool to hear old Walt interv...
1599997 Are you ready for your MoJo Makeover? Ask me f...
1599998 Happy 38th Birthday to my boo of alll time!!! ...
1599999 happy charitytuesday

1600000 rows × 1 columns

Tokenize Tweets into Sentences¶

In [6]:
processed_df['sentence_tokens'] = processed_df['tweet'].apply(lambda x: sent_tokenize(x))
processed_df
Out[6]:
tweet sentence_tokens
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire]
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why...
... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f...
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter...
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me...
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!...
1599999 happy charitytuesday [happy charitytuesday]

1600000 rows × 2 columns

Extract Part-Of-Speech Tags¶

In [7]:
processed_df['pos_tags'] = processed_df['sentence_tokens'].apply(lambda x: [nltk.pos_tag(tokenizer.tokenize(sent)) for sent in x])
processed_df
Out[7]:
tweet sentence_tokens pos_tags
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,...
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB...
... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ...
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V...
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I...
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]]

1600000 rows × 3 columns

Create word tokens from tweets¶

In [8]:
# pattern = regex.compile(r"(.)/\1{2,}")
# pattern.sub(r"\1\1\1", text)
processed_df['word_tokens'] = processed_df['tweet'].apply(lambda x: tokenizer.tokenize(x))
processed_df
Out[8]:
tweet sentence_tokens pos_tags word_tokens
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ... [-, Awww, ,, that, ', s, a, bummer, ., You, sh...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP... [is, upset, that, he, can, ', t, update, his, ...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ... [I, dived, many, times, for, the, ball, ., Man...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,... [my, whole, body, feels, itchy, and, like, its...
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB... [no, ,, it, ', s, not, behaving, at, all, ., i...
... ... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ... [Just, woke, up, ., Having, no, school, is, th...
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V... [TheWDB, ., com, -, Very, cool, to, hear, old,...
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I... [Are, you, ready, for, your, MoJo, Makeover, ?...
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to... [Happy, 38th, Birthday, to, my, boo, of, alll,...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]] [happy, charitytuesday]

1600000 rows × 4 columns

Generate and Remove Stop Words from Word Tokens¶

In [9]:
# Generate stop words
print("Stop Words: ", stopwords.words('english'))
stop_words = set(stopwords.words('english'))
# Remove stop words from tokenized tweets
processed_df['word_tokens_no_stop_words'] = processed_df['word_tokens'].apply(lambda x: [w for w in x if not w.lower() in stop_words])
processed_df
Stop Words:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
Out[9]:
tweet sentence_tokens pos_tags word_tokens word_tokens_no_stop_words
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ... [-, Awww, ,, that, ', s, a, bummer, ., You, sh... [-, Awww, ,, ', bummer, ., shoulda, got, David...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP... [is, upset, that, he, can, ', t, update, his, ... [upset, ', update, Facebook, texting, ..., mig...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ... [I, dived, many, times, for, the, ball, ., Man... [dived, many, times, ball, ., Managed, save, 5...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,... [my, whole, body, feels, itchy, and, like, its... [whole, body, feels, itchy, like, fire]
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB... [no, ,, it, ', s, not, behaving, at, all, ., i... [,, ', behaving, ., ', mad, ., ?, ', see, .]
... ... ... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ... [Just, woke, up, ., Having, no, school, is, th... [woke, ., school, best, feeling, ever]
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V... [TheWDB, ., com, -, Very, cool, to, hear, old,... [TheWDB, ., com, -, cool, hear, old, Walt, int...
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I... [Are, you, ready, for, your, MoJo, Makeover, ?... [ready, MoJo, Makeover, ?, Ask, details]
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to... [Happy, 38th, Birthday, to, my, boo, of, alll,... [Happy, 38th, Birthday, boo, alll, time, !!!, ...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]] [happy, charitytuesday] [happy, charitytuesday]

1600000 rows × 5 columns

Stemming Words in Tweet¶

In [10]:
ps = PorterStemmer()
processed_df['word_tokens_no_stop_stemmed'] = processed_df['word_tokens_no_stop_words'].apply(lambda x: [ps.stem(w) for w in x if not w == ''])
processed_df
Out[10]:
tweet sentence_tokens pos_tags word_tokens word_tokens_no_stop_words word_tokens_no_stop_stemmed
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ... [-, Awww, ,, that, ', s, a, bummer, ., You, sh... [-, Awww, ,, ', bummer, ., shoulda, got, David... [-, awww, ,, ', bummer, ., shoulda, got, david...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP... [is, upset, that, he, can, ', t, update, his, ... [upset, ', update, Facebook, texting, ..., mig... [upset, ', updat, facebook, text, ..., might, ...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ... [I, dived, many, times, for, the, ball, ., Man... [dived, many, times, ball, ., Managed, save, 5... [dive, mani, time, ball, ., manag, save, 50, %...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,... [my, whole, body, feels, itchy, and, like, its... [whole, body, feels, itchy, like, fire] [whole, bodi, feel, itchi, like, fire]
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB... [no, ,, it, ', s, not, behaving, at, all, ., i... [,, ', behaving, ., ', mad, ., ?, ', see, .] [,, ', behav, ., ', mad, ., ?, ', see, .]
... ... ... ... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ... [Just, woke, up, ., Having, no, school, is, th... [woke, ., school, best, feeling, ever] [woke, ., school, best, feel, ever]
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V... [TheWDB, ., com, -, Very, cool, to, hear, old,... [TheWDB, ., com, -, cool, hear, old, Walt, int... [thewdb, ., com, -, cool, hear, old, walt, int...
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I... [Are, you, ready, for, your, MoJo, Makeover, ?... [ready, MoJo, Makeover, ?, Ask, details] [readi, mojo, makeov, ?, ask, detail]
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to... [Happy, 38th, Birthday, to, my, boo, of, alll,... [Happy, 38th, Birthday, boo, alll, time, !!!, ... [happi, 38th, birthday, boo, alll, time, !!!, ...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]] [happy, charitytuesday] [happy, charitytuesday] [happi, charitytuesday]

1600000 rows × 6 columns

Remove Stop Words and Lemmatize Words in Tweets¶

In [11]:
# Map POS tags from pos tagger to tags accepted by WordNetLemmatizer
# Source: https://github.com/nltk/nltk/blob/develop/nltk/stem/wordnet.py
def penn2morphy(penntag) -> str:
    """
    Converts tags from Penn format (input: single string) to Morphy.
    """
    morphy_tag = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None

# Adjust POS tags based on mappings and remove stop words
processed_df['pos_tags_adjusted_no_stop_words'] = processed_df['pos_tags'].apply(
    lambda x: [(w[0], penn2morphy(w[1])) for s in x for w in s if not w[0].lower() in stop_words])

# Lemmatizing words in Tweet
lemmatizer = WordNetLemmatizer()
processed_df['word_tokens_no_stop_lemmatized'] = processed_df['pos_tags_adjusted_no_stop_words'].apply(lambda x: [lemmatizer.lemmatize(word=w[0].lower(), pos=w[1]) if w[1] != None else w[0].lower() for w in x])
processed_df
Out[11]:
tweet sentence_tokens pos_tags word_tokens word_tokens_no_stop_words word_tokens_no_stop_stemmed pos_tags_adjusted_no_stop_words word_tokens_no_stop_lemmatized
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ... [-, Awww, ,, that, ', s, a, bummer, ., You, sh... [-, Awww, ,, ', bummer, ., shoulda, got, David... [-, awww, ,, ', bummer, ., shoulda, got, david... [(-, None), (Awww, n), (,, None), (', None), (... [-, awww, ,, ', bummer, ., shoulda, get, david...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP... [is, upset, that, he, can, ', t, update, his, ... [upset, ', update, Facebook, texting, ..., mig... [upset, ', updat, facebook, text, ..., might, ... [(upset, a), (', None), (update, v), (Facebook... [upset, ', update, facebook, texting, ..., mig...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ... [I, dived, many, times, for, the, ball, ., Man... [dived, many, times, ball, ., Managed, save, 5... [dive, mani, time, ball, ., manag, save, 50, %... [(dived, v), (many, a), (times, n), (ball, n),... [dive, many, time, ball, ., manage, save, 50, ...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,... [my, whole, body, feels, itchy, and, like, its... [whole, body, feels, itchy, like, fire] [whole, bodi, feel, itchi, like, fire] [(whole, a), (body, n), (feels, n), (itchy, v)... [whole, body, feel, itchy, like, fire]
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB... [no, ,, it, ', s, not, behaving, at, all, ., i... [,, ', behaving, ., ', mad, ., ?, ', see, .] [,, ', behav, ., ', mad, ., ?, ', see, .] [(,, None), (', None), (behaving, v), (., None... [,, ', behave, ., ', mad, ., ?, ', see, .]
... ... ... ... ... ... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ... [Just, woke, up, ., Having, no, school, is, th... [woke, ., school, best, feeling, ever] [woke, ., school, best, feel, ever] [(woke, v), (., None), (school, n), (best, a),... [wake, ., school, best, feeling, ever]
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V... [TheWDB, ., com, -, Very, cool, to, hear, old,... [TheWDB, ., com, -, cool, hear, old, Walt, int... [thewdb, ., com, -, cool, hear, old, walt, int... [(TheWDB, n), (., None), (com, n), (-, None), ... [thewdb, ., com, -, cool, hear, old, walt, int...
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I... [Are, you, ready, for, your, MoJo, Makeover, ?... [ready, MoJo, Makeover, ?, Ask, details] [readi, mojo, makeov, ?, ask, detail] [(ready, a), (MoJo, n), (Makeover, n), (?, Non... [ready, mojo, makeover, ?, ask, detail]
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to... [Happy, 38th, Birthday, to, my, boo, of, alll,... [Happy, 38th, Birthday, boo, alll, time, !!!, ... [happi, 38th, birthday, boo, alll, time, !!!, ... [(Happy, a), (38th, None), (Birthday, n), (boo... [happy, 38th, birthday, boo, alll, time, !!!, ...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]] [happy, charitytuesday] [happy, charitytuesday] [happi, charitytuesday] [(happy, a), (charitytuesday, n)] [happy, charitytuesday]

1600000 rows × 8 columns

Strip Punctuation¶

In [12]:
import string
processed_df['tweet_str_no_stop_stemmed'] = processed_df['word_tokens_no_stop_stemmed'].apply(lambda x: ' '.join(x).translate(str.maketrans('', '', string.punctuation)))
processed_df['tweet_str_no_stop_lemmatized'] = processed_df['word_tokens_no_stop_lemmatized'].apply(lambda x: ' '.join(x).translate(str.maketrans('', '', string.punctuation)))
processed_df
Out[12]:
tweet sentence_tokens pos_tags word_tokens word_tokens_no_stop_words word_tokens_no_stop_stemmed pos_tags_adjusted_no_stop_words word_tokens_no_stop_lemmatized tweet_str_no_stop_stemmed tweet_str_no_stop_lemmatized
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ... [-, Awww, ,, that, ', s, a, bummer, ., You, sh... [-, Awww, ,, ', bummer, ., shoulda, got, David... [-, awww, ,, ', bummer, ., shoulda, got, david... [(-, None), (Awww, n), (,, None), (', None), (... [-, awww, ,, ', bummer, ., shoulda, get, david... awww bummer shoulda got david carr third d... awww bummer shoulda get david carr third d...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP... [is, upset, that, he, can, ', t, update, his, ... [upset, ', update, Facebook, texting, ..., mig... [upset, ', updat, facebook, text, ..., might, ... [(upset, a), (', None), (update, v), (Facebook... [upset, ', update, facebook, texting, ..., mig... upset updat facebook text might cri result s... upset update facebook texting might cry resu...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ... [I, dived, many, times, for, the, ball, ., Man... [dived, many, times, ball, ., Managed, save, 5... [dive, mani, time, ball, ., manag, save, 50, %... [(dived, v), (many, a), (times, n), (ball, n),... [dive, many, time, ball, ., manage, save, 50, ... dive mani time ball manag save 50 rest go bound dive many time ball manage save 50 rest go b...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,... [my, whole, body, feels, itchy, and, like, its... [whole, body, feels, itchy, like, fire] [whole, bodi, feel, itchi, like, fire] [(whole, a), (body, n), (feels, n), (itchy, v)... [whole, body, feel, itchy, like, fire] whole bodi feel itchi like fire whole body feel itchy like fire
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB... [no, ,, it, ', s, not, behaving, at, all, ., i... [,, ', behaving, ., ', mad, ., ?, ', see, .] [,, ', behav, ., ', mad, ., ?, ', see, .] [(,, None), (', None), (behaving, v), (., None... [,, ', behave, ., ', mad, ., ?, ', see, .] behav mad see behave mad see
... ... ... ... ... ... ... ... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ... [Just, woke, up, ., Having, no, school, is, th... [woke, ., school, best, feeling, ever] [woke, ., school, best, feel, ever] [(woke, v), (., None), (school, n), (best, a),... [wake, ., school, best, feeling, ever] woke school best feel ever wake school best feeling ever
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V... [TheWDB, ., com, -, Very, cool, to, hear, old,... [TheWDB, ., com, -, cool, hear, old, Walt, int... [thewdb, ., com, -, cool, hear, old, walt, int... [(TheWDB, n), (., None), (com, n), (-, None), ... [thewdb, ., com, -, cool, hear, old, walt, int... thewdb com cool hear old walt interview ⠙« thewdb com cool hear old walt interview ⠙«
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I... [Are, you, ready, for, your, MoJo, Makeover, ?... [ready, MoJo, Makeover, ?, Ask, details] [readi, mojo, makeov, ?, ask, detail] [(ready, a), (MoJo, n), (Makeover, n), (?, Non... [ready, mojo, makeover, ?, ask, detail] readi mojo makeov ask detail ready mojo makeover ask detail
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to... [Happy, 38th, Birthday, to, my, boo, of, alll,... [Happy, 38th, Birthday, boo, alll, time, !!!, ... [happi, 38th, birthday, boo, alll, time, !!!, ... [(Happy, a), (38th, None), (Birthday, n), (boo... [happy, 38th, birthday, boo, alll, time, !!!, ... happi 38th birthday boo alll time tupac amaru... happy 38th birthday boo alll time tupac amaru...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]] [happy, charitytuesday] [happy, charitytuesday] [happi, charitytuesday] [(happy, a), (charitytuesday, n)] [happy, charitytuesday] happi charitytuesday happy charitytuesday

1600000 rows × 10 columns

Pickle data for later use¶

In [13]:
# Pickle processed_df
y.to_pickle("y.pickle")
processed_df.to_pickle("processed_df.pickle")

Load Pickled Data¶

In [3]:
# Read processed_df from pickle file
processed_df = pd.read_pickle("processed_df.pickle")
y = pd.read_pickle("y.pickle")
processed_df
Out[3]:
tweet sentence_tokens pos_tags word_tokens word_tokens_no_stop_words word_tokens_no_stop_stemmed pos_tags_adjusted_no_stop_words word_tokens_no_stop_lemmatized tweet_str_no_stop_stemmed tweet_str_no_stop_lemmatized
0 - Awww, that's a bummer. You shoulda got Da... [ - Awww, that's a bummer., You shoulda got D... [[(-, :), (Awww, NN), (,, ,), (that, IN), (', ... [-, Awww, ,, that, ', s, a, bummer, ., You, sh... [-, Awww, ,, ', bummer, ., shoulda, got, David... [-, awww, ,, ', bummer, ., shoulda, got, david... [(-, None), (Awww, n), (,, None), (', None), (... [-, awww, ,, ', bummer, ., shoulda, get, david... awww bummer shoulda got david carr third d... awww bummer shoulda get david carr third d...
1 is upset that he can't update his Facebook by ... [is upset that he can't update his Facebook by... [[(is, VBZ), (upset, JJ), (that, IN), (he, PRP... [is, upset, that, he, can, ', t, update, his, ... [upset, ', update, Facebook, texting, ..., mig... [upset, ', updat, facebook, text, ..., might, ... [(upset, a), (', None), (update, v), (Facebook... [upset, ', update, facebook, texting, ..., mig... upset updat facebook text might cri result s... upset update facebook texting might cry resu...
2 I dived many times for the ball. Managed to s... [ I dived many times for the ball., Managed to... [[(I, PRP), (dived, VBD), (many, JJ), (times, ... [I, dived, many, times, for, the, ball, ., Man... [dived, many, times, ball, ., Managed, save, 5... [dive, mani, time, ball, ., manag, save, 50, %... [(dived, v), (many, a), (times, n), (ball, n),... [dive, many, time, ball, ., manage, save, 50, ... dive mani time ball manag save 50 rest go bound dive many time ball manage save 50 rest go b...
3 my whole body feels itchy and like its on fire [my whole body feels itchy and like its on fire] [[(my, PRP$), (whole, JJ), (body, NN), (feels,... [my, whole, body, feels, itchy, and, like, its... [whole, body, feels, itchy, like, fire] [whole, bodi, feel, itchi, like, fire] [(whole, a), (body, n), (feels, n), (itchy, v)... [whole, body, feel, itchy, like, fire] whole bodi feel itchi like fire whole body feel itchy like fire
4 no, it's not behaving at all. i'm mad. why am... [ no, it's not behaving at all., i'm mad., why... [[(no, DT), (,, ,), (it, PRP), (', ''), (s, VB... [no, ,, it, ', s, not, behaving, at, all, ., i... [,, ', behaving, ., ', mad, ., ?, ', see, .] [,, ', behav, ., ', mad, ., ?, ', see, .] [(,, None), (', None), (behaving, v), (., None... [,, ', behave, ., ', mad, ., ?, ', see, .] behav mad see behave mad see
... ... ... ... ... ... ... ... ... ... ...
1599995 Just woke up. Having no school is the best fee... [Just woke up., Having no school is the best f... [[(Just, RB), (woke, VBD), (up, RP), (., .)], ... [Just, woke, up, ., Having, no, school, is, th... [woke, ., school, best, feeling, ever] [woke, ., school, best, feel, ever] [(woke, v), (., None), (school, n), (best, a),... [wake, ., school, best, feeling, ever] woke school best feel ever wake school best feeling ever
1599996 TheWDB.com - Very cool to hear old Walt interv... [TheWDB.com - Very cool to hear old Walt inter... [[(TheWDB, NNP), (., .), (com, NN), (-, :), (V... [TheWDB, ., com, -, Very, cool, to, hear, old,... [TheWDB, ., com, -, cool, hear, old, Walt, int... [thewdb, ., com, -, cool, hear, old, walt, int... [(TheWDB, n), (., None), (com, n), (-, None), ... [thewdb, ., com, -, cool, hear, old, walt, int... thewdb com cool hear old walt interview ⠙« thewdb com cool hear old walt interview ⠙«
1599997 Are you ready for your MoJo Makeover? Ask me f... [Are you ready for your MoJo Makeover?, Ask me... [[(Are, NNP), (you, PRP), (ready, JJ), (for, I... [Are, you, ready, for, your, MoJo, Makeover, ?... [ready, MoJo, Makeover, ?, Ask, details] [readi, mojo, makeov, ?, ask, detail] [(ready, a), (MoJo, n), (Makeover, n), (?, Non... [ready, mojo, makeover, ?, ask, detail] readi mojo makeov ask detail ready mojo makeover ask detail
1599998 Happy 38th Birthday to my boo of alll time!!! ... [Happy 38th Birthday to my boo of alll time!!!... [[(Happy, JJ), (38th, CD), (Birthday, NN), (to... [Happy, 38th, Birthday, to, my, boo, of, alll,... [Happy, 38th, Birthday, boo, alll, time, !!!, ... [happi, 38th, birthday, boo, alll, time, !!!, ... [(Happy, a), (38th, None), (Birthday, n), (boo... [happy, 38th, birthday, boo, alll, time, !!!, ... happi 38th birthday boo alll time tupac amaru... happy 38th birthday boo alll time tupac amaru...
1599999 happy charitytuesday [happy charitytuesday] [[(happy, JJ), (charitytuesday, NN)]] [happy, charitytuesday] [happy, charitytuesday] [happi, charitytuesday] [(happy, a), (charitytuesday, n)] [happy, charitytuesday] happi charitytuesday happy charitytuesday

1600000 rows × 10 columns

Model Testing¶

Fully Connected Neural Network¶

Baseline¶

In [16]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(12, input_dim=1000, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

#Compile the model so it can be trained
model.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

cv=CountVectorizer(max_features=1000)
X=cv.fit_transform(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join)).toarray()
neg_tweets_stem = np.array([X[i] for i in range(len(y)) if y[i]==0])[:200000]
pos_tweets_stem = np.array([X[i] for i in range(len(y)) if y[i]==1])[:200000]
print(len(neg_tweets_stem), len(pos_tweets_stem))
kfold(model, neg_tweets_stem, pos_tweets_stem, 256)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 12)                12012     
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 50)                450       
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 12)                612       
                                                                 
 dense_4 (Dense)             (None, 50)                650       
                                                                 
 dense_5 (Dense)             (None, 1)                 51        
                                                                 
=================================================================
Total params: 13,879
Trainable params: 13,879
Non-trainable params: 0
_________________________________________________________________
2022-05-02 00:23:30.964763: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.033882: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.034665: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.036492: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 00:23:31.037316: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.037942: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.038543: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.755184: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.755694: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.756155: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:23:31.757132: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21785 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
200000 200000
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-02 00:23:52.741958: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:00:59.181749
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:00:59.530227
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:00:58.109800
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:00:58.512562
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:00:58.714119
0:00:58.809691

Model adjusted with significant measures to prevent overfitting¶

In [10]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1000, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

cv=CountVectorizer(max_features=1000)
X=cv.fit_transform(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join)).toarray()
neg_tweets_stem = np.array([X[i] for i in range(len(y)) if y[i]==0])[:200000]
pos_tweets_stem = np.array([X[i] for i in range(len(y)) if y[i]==1])[:200000]
print(len(neg_tweets_stem), len(pos_tweets_stem))

kfold(model, neg_tweets_stem, pos_tweets_stem, 256)
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_15 (Dense)            (None, 32)                32032     
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_16 (Dense)            (None, 1)                 33        
                                                                 
=================================================================
Total params: 32,065
Trainable params: 32,065
Non-trainable params: 0
_________________________________________________________________
200000 200000
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:39.026510
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:33.780552
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:33.063937
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:35.238804
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:35.269832
0:01:35.275927

Reduce size of bag of words¶

In [7]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=500, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

# Create bag of words featureset for stemmed 
cv=CountVectorizer(max_features=500)
X=cv.fit_transform(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join)).toarray()
neg_tweets_stem = np.array([X[i] for i in range(len(y)) if y[i]==0 and any(X[i])])[:200000]
pos_tweets_stem = np.array([X[i] for i in range(len(y)) if y[i]==1 and any(X[i])])[:200000]
print(len(neg_tweets_stem), len(pos_tweets_stem))
kfold(model, neg_tweets_stem, pos_tweets_stem, 256)
200000 200000
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:28.622205
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:32.170163
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:30.157432
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:32.955836
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:35.253310
0:01:31.831789

Increase size of bag of words¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1500, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

cv=CountVectorizer(max_features=1500)
X=cv.fit_transform(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join)).toarray()
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])[:200000]
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])[:200000]
print(len(neg_tweets_lemm), len(pos_tweets_lemm))

kfold(model, neg_tweets_lemm, pos_tweets_lemm, 256)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 32)                48032     
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 48,065
Trainable params: 48,065
Non-trainable params: 0
_________________________________________________________________
200000 200000
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:54.382785
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:02:02.181484
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:55.631821
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:51.486260
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:50.994837
0:01:54.935437

Test with TF-IDF¶

In [6]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1500, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_4 (Dense)             (None, 32)                48032     
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 48,065
Trainable params: 48,065
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-02 00:33:51.802676: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:00:46.333092
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:00:44.314895
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:00:44.420734
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:00:44.715829
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:00:44.428026
0:00:44.842515

Testing Other Optimizers¶

SGD¶

In [3]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1500, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = SGD(learning_rate=0.001, momentum=0.9)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256, 50)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 32)                48032     
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 48,065
Trainable params: 48,065
Non-trainable params: 0
_________________________________________________________________
2022-05-02 01:26:44.289392: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:44.359229: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:44.359909: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:44.361586: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 01:26:44.362667: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:44.363292: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:44.363885: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:45.099897: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:45.100529: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:45.101092: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 01:26:45.102195: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21694 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-02 01:27:35.674139: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:01:25.043847
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:23.917364
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:23.963050
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:23.653744
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:23.788485
0:01:24.073298

RMSprop¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1500, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = RMSprop(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_2 (Dense)             (None, 32)                48032     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 48,065
Trainable params: 48,065
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:00:48.983851
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:00:48.705771
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:00:49.182019
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:00:48.449143
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:00:48.933646
0:00:48.850886

Testing Best Model With Lemmatized Words¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1500, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_lemmatized'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_lemmatized'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 32)                48032     
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 48,065
Trainable params: 48,065
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:38.154631
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:40.635658
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:36.044705
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:33.414954
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:32.492165
0:01:36.148423

Larger Dataset¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, input_dim=1000, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1000)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:400000], pos_tweets_lemm[:400000], 256)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 32)                32032     
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 32,065
Trainable params: 32,065
Non-trainable params: 0
_________________________________________________________________
2022-05-02 04:04:11.481881: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:11.550236: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:11.550941: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:11.552433: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 04:04:11.553448: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:11.554093: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:11.554687: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:12.276309: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:12.276830: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:12.277305: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:04:12.278309: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21793 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Fold 0
Test Index Start:0
Test Set Size:80000
2022-05-02 04:05:07.533022: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:01:12.915578
Fold 1
Test Index Start:80000
Test Set Size:80000
fold time:  0:01:12.023179
Fold 2
Test Index Start:160000
Test Set Size:80000
fold time:  0:01:11.853858
Fold 3
Test Index Start:240000
Test Set Size:80000
fold time:  0:01:12.209822
Fold 4
Test Index Start:320000
Test Set Size:80000
fold time:  0:01:12.339642
0:01:12.268416

Convolutional Neural Network¶

CNN 500 Words¶

In [25]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layears.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])

kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:00:55.740388
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:00:53.801638
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:00:52.847483
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:00:52.699347
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:00:52.744881
0:00:53.566747

CNN 1000 Words¶

In [26]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1000, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1000)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d_4 (Conv1D)           (None, 1000, 25)          150       
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 500, 25)          0         
 1D)                                                             
                                                                 
 flatten_4 (Flatten)         (None, 12500)             0         
                                                                 
 dense_8 (Dense)             (None, 16)                200016    
                                                                 
 dropout_4 (Dropout)         (None, 16)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 200,183
Trainable params: 200,183
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:05.652101
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:09.318222
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:04.343792
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:03.508179
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:03.181780
0:01:05.200815

CNN 1500 Words¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1500, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 750, 25)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 18750)             0         
                                                                 
 dense (Dense)               (None, 16)                300016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 300,183
Trainable params: 300,183
Non-trainable params: 0
_________________________________________________________________
2022-05-01 16:54:50.148058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.216819: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.217600: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.219647: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 16:54:50.220731: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.221504: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.222237: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.951252: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.951772: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.952257: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 16:54:50.953252: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21610 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-01 16:55:40.988993: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-01 16:55:43.035191: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:01:29.472818
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:27.898696
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:26.490063
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:26.573140
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:26.244624
0:01:27.335868

CNN 2000 Words (Crashed)¶

In [ ]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(2000, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

# Compile model
modeal.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 2000)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 2000, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1000, 25)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 16)                400016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 400,183
Trainable params: 400,183
Non-trainable params: 0
_________________________________________________________________
2022-05-01 17:13:17.368082: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:17.437223: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:17.438042: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:17.440176: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 17:13:17.441268: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:17.442057: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:17.442813: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:18.158809: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:18.159319: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:18.159786: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:13:18.160779: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21787 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-01 17:14:13.063977: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-01 17:14:15.103869: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:01:48.219974
Fold 1
Test Index Start:40000
Test Set Size:40000

CNN 2 Layer 1500 Words¶

In [5]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
2022-05-01 17:19:45.808093: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:45.877391: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:45.878233: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:45.880865: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 17:19:45.881771: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:45.882415: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:45.883151: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:46.759043: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:46.759749: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:46.760420: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 17:19:46.761582: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21786 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1500, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 750, 25)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 750, 12)           912       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 375, 12)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 4500)              0         
                                                                 
 dense (Dense)               (None, 16)                72016     
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 73,095
Trainable params: 73,095
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-01 17:20:41.659740: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-01 17:20:43.807635: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:02:00.800542
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:02:03.279781
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:02:03.577406
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:02:02.920222
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:55.878413
0:02:01.291273

CNN 3 Layer 1500 Words¶

In [6]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(32, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(16, 5, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Conv1D(18, 5, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(), 
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d_2 (Conv1D)           (None, 1500, 32)          192       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 750, 32)          0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 750, 16)           2576      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 375, 16)          0         
 1D)                                                             
                                                                 
 conv1d_4 (Conv1D)           (None, 375, 18)           1458      
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 187, 18)          0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 3366)              0         
                                                                 
 dense_2 (Dense)             (None, 16)                53872     
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 58,115
Trainable params: 58,115
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:02:27.729023
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:02:29.786910
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:02:26.797031
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:02:26.497980
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:02:26.772808
0:02:27.516750

CNN 2 Layer Adam Optimizer 1500 Words lr=0.0001¶

In [6]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer=opt,
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1500, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 750, 25)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 750, 12)           912       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 375, 12)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 4500)              0         
                                                                 
 dense_2 (Dense)             (None, 16)                72016     
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 73,095
Trainable params: 73,095
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-01 18:50:32.167602: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-01 18:50:34.362391: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:02:00.082564
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:56.349786
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:59.289046
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:50.526270
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:50.574102
0:01:55.364354

CNN 2 Layer SGD Optimizer 1500 Words lr=0.01 momentum=0.5¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1000, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])
opt = SGD(learning_rate=0.01, momentum=0.5)

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer=opt,
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1000)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256, 50)
2022-05-02 00:51:45.292719: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:45.361099: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:45.361799: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:45.363456: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 00:51:45.364539: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:45.365315: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:45.365910: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:46.075948: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:46.076583: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:46.077188: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 00:51:46.078295: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21793 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1000, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 500, 25)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 500, 12)           912       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 250, 12)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 3000)              0         
                                                                 
 dense (Dense)               (None, 16)                48016     
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 49,095
Trainable params: 49,095
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-02 00:52:41.023864: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-02 00:52:43.101416: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:02:42.912876
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:02:39.293006
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:02:39.649271
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:02:40.378114
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:02:39.249902
0:02:40.296634

CNN 2 Layer RMSprop Optimizer 1500 Words lr=0.0001¶

In [4]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])
opt = RMSprop(learning_rate=0.0001)

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer=opt,
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="tfidf")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
2022-05-01 19:33:31.836060: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:31.904550: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:31.905363: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1500, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 750, 25)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 750, 12)           912       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 375, 12)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 4500)              0         
                                                                 
 dense (Dense)               (None, 16)                72016     
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 73,095
Trainable params: 73,095
Non-trainable params: 0
_________________________________________________________________
2022-05-01 19:33:31.907570: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 19:33:31.908619: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:31.909407: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:31.910165: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:32.640376: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:32.640907: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:32.641383: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-01 19:33:32.642374: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21717 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:0b:00.0, compute capability: 8.6
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-01 19:34:23.250389: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-05-01 19:34:25.307571: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
fold time:  0:01:54.395106
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:51.328465
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:51.481103
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:51.402162
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:51.386578
0:01:51.998683

Testing Different Ways of Tokenizing Tweets into Matrix¶

Binary¶

In [5]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer=opt,
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="binary")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1500, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 750, 25)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 750, 12)           912       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 375, 12)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 4500)              0         
                                                                 
 dense_4 (Dense)             (None, 16)                72016     
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 73,095
Trainable params: 73,095
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-02 01:45:29.801537: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
fold time:  0:01:50.539248
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:02:00.462754
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:49.002432
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:47.751216
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:51.766371
0:01:51.904404

Count¶

In [6]:
t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="count")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:48.242117
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:47.381723
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:47.105621
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:47.489196
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:47.267659
0:01:47.497263

Frequency¶

In [7]:
t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="freq")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Fold 0
Test Index Start:0
Test Set Size:40000
fold time:  0:01:48.640231
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:48.735265
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:48.615698
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:48.439335
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:48.119685
0:01:48.510043

Testing Best Model With Lemmatized Words¶

In [5]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1500, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

from tensorflow.keras.optimizers import Adam, SGD, RMSprop
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer=opt,
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1500)
t.fit_on_texts(processed_df['word_tokens_no_stop_lemmatized'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_lemmatized'].apply(' '.join), mode="count")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:200000], pos_tweets_lemm[:200000], 256)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 1500, 25)          150       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 750, 25)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 750, 12)           912       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 375, 12)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 4500)              0         
                                                                 
 dense_2 (Dense)             (None, 16)                72016     
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 73,095
Trainable params: 73,095
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:40000
2022-05-02 04:13:17.980794: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
fold time:  0:01:51.399084
Fold 1
Test Index Start:40000
Test Set Size:40000
fold time:  0:01:55.335044
Fold 2
Test Index Start:80000
Test Set Size:40000
fold time:  0:01:49.473792
Fold 3
Test Index Start:120000
Test Set Size:40000
fold time:  0:01:49.067554
Fold 4
Test Index Start:160000
Test Set Size:40000
fold time:  0:01:50.812833
0:01:51.217661

Testing best model with larger dataset¶

In [7]:
# Create model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(25, 5, padding="same", activation="relu", input_shape=(1000, 1)),
    tf.keras.layers.MaxPooling1D(),
    tf.keras.layers.Conv1D(12, 3, padding="same", activation="relu"),
    tf.keras.layers.MaxPooling1D(),    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

from tensorflow.keras.optimizers import Adam, SGD, RMSprop
opt = Adam(learning_rate=0.0001)

# Compile model
model.compile(
     loss='binary_crossentropy',
     optimizer=opt,
     metrics=['accuracy']
)
model.summary()

t = tf.keras.preprocessing.text.Tokenizer(num_words = 1000)
t.fit_on_texts(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join))
X = t.texts_to_matrix(processed_df['word_tokens_no_stop_stemmed'].apply(' '.join), mode="count")
neg_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==0])
pos_tweets_lemm = np.array([X[i] for i in range(len(y)) if y[i]==1])
kfold(model, neg_tweets_lemm[:400000], pos_tweets_lemm[:400000], 256)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d_2 (Conv1D)           (None, 1000, 25)          150       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 500, 25)          0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 500, 12)           912       
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 250, 12)          0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 3000)              0         
                                                                 
 dense_4 (Dense)             (None, 16)                48016     
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 49,095
Trainable params: 49,095
Non-trainable params: 0
_________________________________________________________________
Fold 0
Test Index Start:0
Test Set Size:80000
fold time:  0:02:31.909580
Fold 1
Test Index Start:80000
Test Set Size:80000
fold time:  0:02:30.856979
Fold 2
Test Index Start:160000
Test Set Size:80000
fold time:  0:02:29.617464
Fold 3
Test Index Start:240000
Test Set Size:80000
fold time:  0:02:29.452412
Fold 4
Test Index Start:320000
Test Set Size:80000
fold time:  0:02:30.034329
0:02:30.374153